dir = list.dirs("agneesh/all_data/kallisto_results/", full.names = T, recursive = F)
filename = file.path(dir, "abundance.tsv")
names <- data.frame(name = filename) %>% separate(name, sep = "//", into = c(NA,"test")) %>% separate(test, sep = "/", into = c("name", NA)) %>% pull(name)
pm <- tibble(target_id = character(), est_counts = numeric(), tpm = numeric(), library = character())
for (j in seq(filename)) pm <- rbind(pm, read_tsv(filename[j]) %>% select(target_id, est_counts, tpm) %>% mutate(library = names[j]))
genes <- read_tsv("ref/genes.txt", col_names = c("target_id", "gene"))
taiwan_venom <- read_rds("data/pm.RDS") %>% filter(grepl("Pm", library)) %>% left_join(genes, by = "target_id") %>% na.omit() %>%
group_by(gene) %>% summarize(tpm = sum(tpm))
taiwan_proteome <- read_csv("data/taiwan-secreted.csv", col_types = "-nc", col_names = c("gene", "type"))
rbind(read_tsv("data/1-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan1"),
read_tsv("data/11-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan11"),
read_tsv("data/2-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan2"),
read_tsv("data/3-Taiwan-Male/abundance.tsv") %>% mutate(library = "taiwan3")) %>%
left_join(genes, by = "target_id") %>% na.omit() %>%
group_by(library, gene) %>% summarize(tpm = sum(tpm), est_counts = sum(est_counts)) -> taiwan
averaged_data <- left_join(taiwan %>% filter(est_counts > 3) %>%
group_by(gene) %>% summarize(tpm = mean(tpm), est_counts = mean(est_counts)), taiwan_venom %>% group_by(gene) %>% summarize(tpm = mean(tpm)), by = "gene")
with(averaged_data, cor.test(tpm.x, tpm.y, method = "s"))
## Warning in cor.test.default(tpm.x, tpm.y, method = "s"): Cannot compute exact p-
## value with ties
##
## Spearman's rank correlation rho
##
## data: tpm.x and tpm.y
## S = 5.9513e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3908262
tp1 <- averaged_data %>% ggplot(aes(tpm.y, tpm.x, color = log10(est_counts +1 ))) + geom_point() + scale_y_log10(labels=trans_format('log10',math_format(10^.x))) + scale_x_log10(labels=trans_format('log10',math_format(10^.x))) + theme_bw() + xlab("Venom gland RNA (TPM)") + ylab("Venom RNA (TPM)") + labs(color=expression('Log'[10]*" counts")) + theme(legend.position = 'bottom')
tp1 #(supp fig 1)
## Warning: Transformation introduced infinite values in continuous x-axis
with(averaged_data %>% filter(gene %in% taiwan_proteome$gene), cor.test(tpm.x, tpm.y, method = "s"))
##
## Spearman's rank correlation rho
##
## data: tpm.x and tpm.y
## S = 21256, p-value = 1.218e-08
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.628099
tp2 <- averaged_data %>% filter(gene %in% taiwan_proteome$gene) %>% ggplot(aes(tpm.y, tpm.x, color = log10(est_counts +1 ))) + geom_point() + scale_y_log10(labels=trans_format('log10',math_format(10^.x))) + scale_x_log10(labels=trans_format('log10',math_format(10^.x))) + theme_bw() + xlab("Venom gland RNA (TPM)") + ylab("Venom RNA (TPM)") + labs(color=expression('Log'[10]*" counts")) + theme(legend.position = 'bottom')
tp2 #(supp fig 2)
## Warning: Transformation introduced infinite values in continuous x-axis
taiwan_merged_venom <- left_join(filter(taiwan, gene %in% taiwan_proteome$gene), taiwan_venom, by = "gene")
taiwan_merged_venom %>% group_by(library) %>% summarize(cor = cor(tpm.x, tpm.y), method = "s")
## # A tibble: 4 x 3
## library cor[,1] method
## <chr> <dbl> <chr>
## 1 taiwan1 0.727 s
## 2 taiwan11 0.498 s
## 3 taiwan2 0.575 s
## 4 taiwan3 0.771 s
left_join(filter(taiwan, est_counts >= 1 & ! gene %in% taiwan_proteome$gene), taiwan_venom, by = "gene") %>% group_by(library) %>% summarize(cor = cor(tpm.x, tpm.y), method = "s")
## # A tibble: 4 x 3
## library cor[,1] method
## <chr> <dbl> <chr>
## 1 taiwan1 0.200 s
## 2 taiwan11 0.285 s
## 3 taiwan2 -0.000149 s
## 4 taiwan3 0.382 s
p1 <- taiwan_merged_venom %>% na.omit() %>% filter(est_counts >= 1) %>% ggplot(aes(tpm.y, tpm.x, color = library)) + geom_line(aes(group = gene), color = "grey") + geom_point() + scale_color_locuszoom()+
scale_y_log10(labels=trans_format('log10',math_format(10^.x))) +scale_x_log10(labels=trans_format('log10',math_format(10^.x))) + theme_bw() + xlab("Venom gland gene expression (TPM)") + ylab("Venom mRNA level (TPM)") + guides(color = F) + ggtitle("Protobothrops mucrosquamatus") + theme(plot.title = element_text(face = "italic"))
p1
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
Pm_tpm_exp<-rbind(read_tsv("data/1-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan1"),
read_tsv("data/2-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan2"),
read_tsv("data/3-Taiwan-Male/abundance.tsv") %>% mutate(id="taiwan3"),
read_tsv("data/11-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan11")) %>%
dplyr::select(target_id,tpm,id) %>% spread(id,tpm)
pm<-readRDS("./data/pm.RDS")
pm<-pm %>% dplyr::select(-est_counts) %>% spread(library,tpm)
pm_tpm<-cbind(Pm_tpm_exp,pm[14:41])
pm_tpm %>% group_by(target_id) %>% summarise(vRNA = mean(taiwan1:taiwan3),vgRNA = mean(Pm_10:Pm_9)) %>% filter(vRNA > 2 & vgRNA >2)->model_dat
fit<-lm(log10(vgRNA)~log10(vRNA),data = model_dat)
summary(fit)
##
## Call:
## lm(formula = log10(vgRNA) ~ log10(vRNA), data = model_dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10589 -0.34591 0.00843 0.32871 2.44065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.59561 0.01723 34.57 <2e-16 ***
## log10(vRNA) 0.51516 0.01056 48.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5151 on 5175 degrees of freedom
## Multiple R-squared: 0.3152, Adjusted R-squared: 0.315
## F-statistic: 2381 on 1 and 5175 DF, p-value: < 2.2e-16
Model properties
plot(fit)
ggplot(fit,aes(fit$residuals))+
geom_freqpoly()
model_dat %>%
ggplot(aes(x=log10(vgRNA), y=log10(vRNA)))+
geom_point()+
geom_smooth(method = "lm")
#model diagnostic plots how that the model has good properties
okinawa_old <- read_tsv("old/okinawa.txt") # protein data from Aird et al.
rbind(read_tsv("data/7-Okinawa-Male/abundance.tsv") %>% mutate(id = "okinawa7"),
read_tsv("data/8-Okinawa-Female/abundance.tsv") %>% mutate(id = "okinawa8"),
read_tsv("data/9-Okinawa-Male/abundance.tsv") %>% mutate(id = "okinawa9") ) %>%
left_join(read_tsv("data/okinawa_ref/abundance.tsv") %>% mutate(ref = tpm) %>% select(ref, target_id), by = c("target_id")) -> okinawa
cor.test(okinawa_old$fpkm, okinawa_old$`frag/len`) #what does this mean?
##
## Pearson's product-moment correlation
##
## data: okinawa_old$fpkm and okinawa_old$`frag/len`
## t = 2.6585, df = 102, p-value = 0.009114
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.06516617 0.42625081
## sample estimates:
## cor
## 0.2545596
okinawa_merged <- left_join(okinawa_old, okinawa, by =c("Pf" = "target_id"))
okinawa_merged %>% group_by(id) %>% filter(est_counts >= 0) %>% summarize(cor = cor(tpm, `frag/len`), method = "s")
## # A tibble: 3 x 3
## id cor[,1] method
## <chr> <dbl> <chr>
## 1 okinawa7 0.480 s
## 2 okinawa8 0.406 s
## 3 okinawa9 0.429 s
cor.test(okinawa_merged$tpm, okinawa_merged$`frag/len`)
##
## Pearson's product-moment correlation
##
## data: okinawa_merged$tpm and okinawa_merged$`frag/len`
## t = 7.4364, df = 310, p-value = 1.02e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2905969 0.4794078
## sample estimates:
## cor
## 0.3890809
okinawa_merged %>% group_by(id) %>% filter(est_counts >= 0) %>% summarize(cor = cor(tpm, ref), method = "s")
## # A tibble: 3 x 3
## id cor[,1] method
## <chr> <dbl> <chr>
## 1 okinawa7 0.834 s
## 2 okinawa8 0.774 s
## 3 okinawa9 0.869 s
cor.test(okinawa_merged$tpm,okinawa_merged$ref)
##
## Pearson's product-moment correlation
##
## data: okinawa_merged$tpm and okinawa_merged$ref
## t = 17.756, df = 310, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6503164 0.7611081
## sample estimates:
## cor
## 0.71008
p2 <- okinawa_merged %>% filter(est_counts >= 1) %>% ggplot(aes(ref, tpm, color = id)) + geom_line(aes(group = Pf), color = "grey")+ geom_point() + scale_color_jco()+ scale_y_log10(labels=trans_format('log10',math_format(10^.x))) +scale_x_log10(labels=trans_format('log10',math_format(10^.x))) + theme_bw() + xlab("Venom gland gene expression (TPM)") + ylab("Venom mRNA level (TPM)") + guides(color = F) + ggtitle("Protobothrops flavoviridis") + theme(plot.title = element_text(face = "italic"))
p2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
taiwan %>% filter(tpm > 1) %>%
select(-est_counts) %>%
mutate(tpm = log(1+tpm)) %>% pivot_wider(names_from = library, values_from = tpm) %>%
select(-taiwan2) %>%
filter(!if_any(everything(), is.na)) %>%
ggpairs(columns = 2:4)
dat<-pm_tpm %>% group_by(target_id) %>%
summarise(venom_gland=mean(Pm_10:Pm_9),venom_rna = mean(taiwan1:taiwan3)) %>%
filter(venom_gland >1,venom_rna>1) #filter to remove low exp genes hugging the axis to improve readability
toxins<-read.csv("./data/toxins.csv")
dat_toxins<-dat %>%filter(target_id %in% toxins$Transcripts)
toxins<-read.csv("./data/toxins.csv")
toxins<-rename(toxins, target_id=Transcripts)
allu_dat<-inner_join(dat_toxins,toxins,"target_id") %>% select(venom_gland, venom_rna, short) %>% rename(Toxin=short)
allu_dat<-allu_dat %>% gather(Toxin) %>% mutate(toxin = rep(allu_dat$Toxin,2)) %>% rename(tissue=Toxin)
toxins<-c("SVMP","SVSP","CTL","PLA2","LAAO","CRISP","VEGF","BPP","NGF")
allu_dat<-allu_dat %>% filter(toxin %in% toxins)
ggplot(allu_dat, aes(y= log(value),
axis1 = toxin,
axis2 = tissue))+
geom_alluvium(aes(fill = toxin))+
geom_stratum()+
geom_text(stat = "stratum",
aes(label = after_stat(stratum))) +
scale_x_discrete(limits = c("toxin", "tissue"),
expand = c(0.15, 0.05))+
scale_fill_viridis_d()+
theme_void()
RNA from the venom displays much more variance than the RNA from the venom gland after a variance stabilizing transformation. While this can be attributed to inter-individual variance, from this plot it seems that there is higher inter-individual variance in venom RNA than gland RNA. The discrepancy can also be due to the venom libraries being of lower quality. Either way, according to our data, we’d expect there to be higher variation in RNA from venom than from the gland.
library(DESeq2)
## Warning: package 'GenomeInfoDb' was built under R version 4.0.5
library(vsn)
ten_counts<-v_Pm_counts<-rbind(read_tsv("data/1-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan1"),
read_tsv("data/2-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan2"),
read_tsv("data/3-Taiwan-Male/abundance.tsv") %>% mutate(id="taiwan3"),
read_tsv("data/11-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan11")) %>%
dplyr::select(target_id,est_counts,id) %>% group_by(target_id) %>% summarise(mean_count = mean(est_counts)) %>% filter(mean_count >= 10) %>% select(target_id)
v_Pm_counts<-rbind(read_tsv("data/1-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan1"),
read_tsv("data/2-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan2"),
read_tsv("data/3-Taiwan-Male/abundance.tsv") %>% mutate(id="taiwan3"),
read_tsv("data/11-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan11")) %>%
dplyr::select(target_id,est_counts,id) %>% spread(id,est_counts) %>% column_to_rownames("target_id") %>% round() %>%
rownames_to_column("target_id") %>% filter(target_id %in% ten_counts$target_id)
g<-tibble(id = c("taiwan1","taiwan11","taiwan2","taiwan3"))
dds_v <- DESeqDataSetFromMatrix(countData = v_Pm_counts,
colData = g,
design=~1, tidy = T)
dds_v <- DESeq(dds_v)
## Warning in DESeq(dds_v): the design is ~ 1 (just an intercept). is this
## intended?
vsd<-vst(dds_v)
pm<-readRDS("./data/pm.RDS")
vg_pm_counts<-pm %>% dplyr::select(-tpm) %>% spread(library,est_counts) %>% column_to_rownames("target_id") %>% round() %>%
rownames_to_column("target_id") %>% select(-c(P_m_01_heart,P_m_01_kidney,P_m_01_liver,
P_m_03_heart,P_m_03_kidney,P_m_03_liver,
P_m_05_heart,P_m_05_kidney,P_m_05_liver,
P_m_08_heart,P_m_08_kidney,P_m_08_liver)) %>% filter(target_id %in% ten_counts$target_id)
g<-tibble(id=colnames(vg_pm_counts[2:29]))
dds_vg <- DESeqDataSetFromMatrix(countData = vg_pm_counts,
colData = g,
design=~1, tidy = T)
dds_vg <- DESeq(dds_vg)
## Warning in DESeq(dds_vg): the design is ~ 1 (just an intercept). is this
## intended?
vsd_vg<-vst(dds_vg)
meanSdPlot(assay(vsd))
meanSdPlot(assay(vsd_vg))
taiwan_venom2 <- taiwan_venom <- read_rds("data/pm.RDS") %>% left_join(genes, by = "target_id") %>% na.omit() %>%
group_by(gene, library) %>% summarize(tpm = sum(tpm))
taiwan_merged2 <- rbind(taiwan %>% filter(tpm >= 2) %>% select(-est_counts), taiwan_venom2) %>%
pivot_wider(names_from = library, values_from = tpm)
d <- taiwan_merged2 %>% ungroup %>% select(-gene, -P_m_05_heart, -taiwan2)
d <- d[sort(rowSums(d, na.rm = T), decreasing = T, index.return = T)[[2]][1:5000],]
d <- log(d+1) %>% t() %>% dist()
fit <- MASS::isoMDS(d, k=2)
## initial value 24.091138
## final value 24.072651
## converged
plotDat <- data.frame(x = fit$points[,1], y = fit$points[,2], tissue = c(rep("venom",3), rep(c("heart", "kidney", "liver"),2), c("kidney", "liver"), c("heart", "kidney", "liver"), rep("venom gland", 28)) )
centroids <- aggregate(cbind(x,y) ~ tissue, plotDat, mean)
conf.rgn <- do.call(rbind, lapply(unique(plotDat$tissue), function(t)
data.frame(tissue = as.character(t),
ellipse(cov(plotDat[plotDat$tissue == t, 1:2]),
centre=as.matrix(centroids[centroids$tissue == t,2:3]),
level=0.95),
stringsAsFactors=FALSE)))
tp2 <- ggplot(plotDat, aes(x,y,color = tissue)) + geom_point(size=3) + theme_bw() + geom_path(data=conf.rgn) + xlab("NMDS Axis 1") + ylab("NMDS Axis 2")+ theme(legend.position = 'bottom')
tp2
#g <- grid.arrange(tp1, tp2, ncol = 1)
#ggsave(file="plots/Figure 2 taiwan.pdf", g, width = 5, height = 10)
As expected given the lower coverage the venom RNA nests with the venom gland samples
modules <- read_csv(file = "data/modules.csv")
taiwan_venom_matrix <- rbind(read_rds("data/pm.RDS") %>% filter(grepl("Pm", library)) %>% select(-est_counts),
rbind(read_tsv("data/1-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan1"),
read_tsv("data/11-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan11"),
read_tsv("data/3-Taiwan-Male/abundance.tsv") %>% mutate(library = "taiwan3")) %>%
filter(est_counts > 1) %>% select(target_id, library, tpm) %>%
mutate(tpm = log(1+tpm)) %>% select(library, target_id, tpm)) %>%
pivot_wider(names_from = target_id, values_from = tpm)
taiwan_venom_matrix2 <- taiwan_venom_matrix[,intersect(modules$target_id, colnames(taiwan_venom_matrix)[colSums(is.na(taiwan_venom_matrix)) == 0])]
gsg <- goodSamplesGenes(taiwan_venom_matrix2)
gsg$allOK
goodModules <- modules %>% filter(target_id %in% colnames(taiwan_venom_matrix2)) %>% group_by(colors) %>% summarize(count = n() > 200) %>% filter(count == T) %>% pull(colors) # filter out small modules
keep <- intersect(modules %>% filter(colors %in% goodModules) %>% pull(target_id), colnames(taiwan_venom_matrix2))
setLabels <- c("old", "new");
multiExpr <- list(old = list(data = taiwan_venom_matrix2[1:28, keep]), new = list(data = taiwan_venom_matrix2[28:30, keep]))
multiColor <- list(old = modules %>% filter(target_id %in% keep) %>% pull(colors))
allowWGCNAThreads(10)
mp <- modulePreservation(multiExpr, multiColor,
parallelCalculation = T,
referenceNetworks = 1,
checkData = F,
nPermutations = 200,
randomSeed = 1,
verbose = 3)
mp<-readRDS("./data/module_preservation.rds")
mp$preservation$Z$ref.old$inColumnsAlsoPresentIn.new %>% mutate(p = 10^mp$preservation$log.p$ref.old$inColumnsAlsoPresentIn.new$log.psummary.pres) %>% select (moduleSize, Zsummary.pres, p)
## moduleSize Zsummary.pres p
## blue 681 -0.2798956 3.250740e-07
## cyan 256 3.3259614 1.148677e-05
## gold 1000 1.9461970 4.818050e-03
## greenyellow 478 4.9593123 2.755866e-07
## lightgreen 229 1.0653598 6.591700e-05
## purple 355 6.5677331 2.062959e-11
## red 264 3.0285454 5.141025e-04
## turquoise 950 3.5761832 4.191815e-05
#write_rds(mp, "data/module_preservation.rds")
The metavenom module is highly preserved in the venom RNA
#Human-habu_modules from orgin of specialisation study
habu_human_modules<-read_csv("./data/all_data_tpmKeep_modules.csv")
#get muscle module
habu_human_muscle<-habu_human_modules %>% filter(colors == "yellow")
#filter the muscle data in vRNA gRNA matrix
pm_tpm_muscle<-pm_tpm %>% filter(target_id %in% habu_human_muscle$habu_target_id)
dat_muscle<-pm_tpm_muscle %>% group_by(target_id) %>% summarise(venom_rna=mean(taiwan1:taiwan3),venom_gland=mean(Pm_10:Pm_9))
cor(dat_muscle$venom_rna,dat_muscle$venom_gland,method = 's')
## [1] 0.4764015
#lower correlation for muscle than venom
dat_muscle %>% ggplot()+
geom_point(aes(x=target_id, y=venom_rna,color="venom"))+
geom_point(aes(x=target_id, y=venom_gland,color="gland"))+
scale_color_manual(values = c("#94E11E","#6B1EE1"))+
scale_y_log10()+
ylab("RNA abundance (logTPM)")+
theme_bw()
## Warning: Transformation introduced infinite values in continuous y-axis
#Most mucle abundance is in vgRNA. Comparatively, vRNA has lower amount of muscle contaminants.
taiwan_venom2 <- taiwan_venom <- read_rds("data/pm.RDS") %>% left_join(genes, by = "target_id") %>% na.omit() %>%
group_by(gene, library) %>% summarize(tpm = sum(tpm))
taiwan_merged2 <- rbind(taiwan %>% filter(tpm >= 2) %>% select(-est_counts), taiwan_venom2) %>%
pivot_wider(names_from = library, values_from = tpm)
taiwan_merged2<-taiwan_merged2 %>% filter(!gene %in% habu_human_muscle$habu_gene_id)
d <- taiwan_merged2 %>% ungroup %>% select(-gene, -P_m_05_heart, -taiwan2)
d <- d[sort(rowSums(d, na.rm = T), decreasing = T, index.return = T)[[2]][1:5000],]
d <- log(d+1) %>% t() %>% dist()
fit <- MASS::isoMDS(d, k=2)
## initial value 21.796921
## final value 21.777990
## converged
plotDat <- data.frame(x = fit$points[,1], y = fit$points[,2], tissue = c(rep("venom",3), rep(c("heart", "kidney", "liver"),2), c("kidney", "liver"), c("heart", "kidney", "liver"), rep("venom gland", 28)) )
centroids <- aggregate(cbind(x,y) ~ tissue, plotDat, mean)
conf.rgn <- do.call(rbind, lapply(unique(plotDat$tissue), function(t)
data.frame(tissue = as.character(t),
ellipse(cov(plotDat[plotDat$tissue == t, 1:2]),
centre=as.matrix(centroids[centroids$tissue == t,2:3]),
level=0.95),
stringsAsFactors=FALSE)))
tp2 <- ggplot(plotDat, aes(x,y,color = tissue)) + geom_point(size=3) + theme_bw() + geom_path(data=conf.rgn) + xlab("NMDS Axis 1") + ylab("NMDS Axis 2")+ theme(legend.position = 'bottom')
tp2